import os
import re
from multiprocessing.dummy import Pool
from lxml import etree
import requests


# Category listing page to crawl.
url = 'https://www.pearvideo.com/category_8'
# Site root; the relative detail-page links found on the listing page are
# appended to it.
video_base_url = 'https://www.pearvideo.com/'
# Browser-like User-Agent header sent with every request made by this script.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
}

# Worker pools (multiprocessing.dummy.Pool is backed by threads), 5 workers each.
download_pool = Pool(5)
save_pool = Pool(5)

# Directory the downloaded videos are saved into.
video_category = '视频'
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it has no
# check-then-create race, and the directory name is now taken from
# video_category instead of being repeated as a second literal.
os.makedirs(video_category, exist_ok=True)


# Fetch the category listing page and collect the detail-page links from it.
# A timeout is set on every request so a stalled server cannot hang the script.
request_timeout = 10  # seconds
page_text = requests.get(url=url, headers=headers, timeout=request_timeout).text
tree = etree.HTML(page_text)
title_list = tree.xpath('//div[@class="vervideo-title"]//text()')  # titles shown on the listing page
video_suffix_list = tree.xpath('//div[@class="vervideo-bd"]/a/@href')  # relative links, e.g. video_1628826

# The real video address is embedded in the detail page's JavaScript, e.g.
#   srcUrl="https://video.pearvideo.com/mp4/adshort/20191203/cont-1629045-14659742_adpkg-ad_hd.mp4",vdoUrl
pattern = 'srcUrl="(.*?)",vdoUrl'
src_url_regex = re.compile(pattern, re.S)  # compiled once, reused for every detail page
# Characters removed from a title before it is used as a file name: the path
# separators, the other characters Windows rejects in file names, and the
# full-width '？' / '：' that the titles commonly contain.
unsafe_name_chars = re.compile(r'[\\/:*?"<>|？：]')

# One {'name': <output path>, 'url': <video address>} dict per video found.
file_url = []
for video_suffix in video_suffix_list:
    # Detail page, e.g. https://www.pearvideo.com/video_1629045
    video_url = video_base_url + video_suffix
    try:
        video_detail_text = requests.get(url=video_url, headers=headers, timeout=request_timeout).text
    except requests.RequestException as err:
        # One unreachable detail page should not abort the whole crawl.
        print(video_url, 'request failed, skipped:', err)
        continue

    v_tree = etree.HTML(video_detail_text)
    title_nodes = v_tree.xpath('//div[@class="video-tt-box"]//h1//text()') if v_tree is not None else []
    url_matches = src_url_regex.findall(video_detail_text)
    if not title_nodes or not url_matches:
        # Page layout differs from what is expected; indexing [0] would raise IndexError.
        print(video_url, 'has no title or video address, skipped')
        continue

    title = unsafe_name_chars.sub('', title_nodes[0]) + '.mp4'
    file_name = os.path.join(video_category, title)
    real_video_url = url_matches[0]
    file_url.append({'name': file_name, 'url': real_video_url})

def downloader(file_url):
    """Download one video and save it to disk.

    The response body is streamed to a temporary ``<name>.part`` file in
    chunks, so the whole video is never held in memory, and the file is
    renamed to its final name only after the download completed.

    Args:
        file_url: dict with the keys 'url' (address of the video file) and
            'name' (path the video is written to).

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
        OSError: if the output file cannot be written.
    """
    target = file_url['name']
    partial = target + '.part'
    # timeout=(connect, read) in seconds; the read timeout applies to each
    # wait for data, not to the download as a whole.
    with requests.get(url=file_url['url'], headers=headers, stream=True, timeout=(10, 60)) as response:
        # Fail before creating any file, so an error page is never saved as a video.
        response.raise_for_status()
        try:
            with open(partial, 'wb') as fp:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    fp.write(chunk)
        except BaseException:
            # Do not leave a truncated file behind when the transfer is interrupted.
            if os.path.exists(partial):
                os.remove(partial)
            raise
    os.replace(partial, target)
   
# Download all collected videos concurrently; map() blocks until every task
# has finished and returns the list of downloader() return values.
try:
    run = download_pool.map(downloader, file_url)
finally:
    # Shut both pools down so their worker threads are released, even when a
    # download raised.
    for pool in (download_pool, save_pool):
        pool.close()
        pool.join()

